PCA for wecare-only (b37) wes dataset w/o Danish:
References:
# Time
Sys.time()
## [1] "2021-01-22 23:49:59 GMT"
# Memory
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 411912 22.0 856132 45.8 NA 641517 34.3
## Vcells 789331 6.1 8388608 64.0 16384 1768563 13.5
# Clean up
rm(list=ls())
graphics.off()
# Options
options(stringsAsFactors = F)
# Working folders
base_folder <- "/Users/alexey/Documents" # mac
project_folder <- file.path(base_folder,"wecare","final_analysis_2021","reanalysis_wo_danish_2021") # mac
scripts_folder <- file.path(project_folder,"scripts","s08_pca")
setwd(scripts_folder)
data_folder <- file.path(project_folder,"data","s08_pca")
library(bigsnpr) # for bed_autoSVD() and bed()
## Loading required package: bigstatsr
library(bigutilsr) # for prob_dist() and tukey_mc_up() for outlier detection
library(hexbin) # for plotting svd loadings
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
NCORES <- 1
#NCORES <- nb_cores() # 2
# Location of bed file
bed_file <- file.path(data_folder,"s03_non_related","common_hwe_norel.bed")
# Attach PLINK data to R environment
wecare.bed <- bed(bed_file) # bigsnpr::bed
# Explore wecare.bed
wecare.bed
## A 'bed' object with 335 samples and 39628 variants.
attributes(wecare.bed)
## $.xData
## <environment: 0x7f8ec82a9668>
##
## $class
## [1] "bed"
## attr(,"package")
## [1] "bigsnpr"
str(wecare.bed)
## Reference class 'bed' [package "bigsnpr"] with 13 fields
## $ bedfile: chr "/Users/alexey/Documents/wecare/final_analysis_2021/reanalysis_wo_danish_2021/data/s08_pca/s03_non_related/common_hwe_norel.bed"
## $ extptr :<externalptr>
## $ .fam :'data.frame': 335 obs. of 6 variables:
## ..$ family.ID : chr [1:335] "P1_A01" "P1_A02" "P1_A03" "P1_A04" ...
## ..$ sample.ID : chr [1:335] "P1_A01" "P1_A02" "P1_A03" "P1_A04" ...
## ..$ paternal.ID: int [1:335] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ maternal.ID: int [1:335] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ sex : int [1:335] 2 2 2 2 2 2 2 2 2 2 ...
## ..$ affection : int [1:335] 2 2 2 2 2 2 1 2 2 1 ...
## $ .map :'data.frame': 39628 obs. of 6 variables:
## ..$ chromosome : int [1:39628] 1 1 1 1 1 1 1 1 1 1 ...
## ..$ marker.ID : chr [1:39628] "1_881627_G_A" "1_897325_G_C" "1_897564_T_C" "1_897738_C_T" ...
## ..$ genetic.dist: int [1:39628] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ physical.pos: int [1:39628] 881627 897325 897564 897738 911916 982941 982994 986443 987200 990380 ...
## ..$ allele1 : chr [1:39628] "G" "G" "T" "T" ...
## ..$ allele2 : chr [1:39628] "A" "C" "C" "C" ...
## $ prefix : chr "/Users/alexey/Documents/wecare/final_analysis_2021/reanalysis_wo_danish_2021/data/s08_pca/s03_non_related/common_hwe_norel"
## $ bimfile: chr "/Users/alexey/Documents/wecare/final_analysis_2021/reanalysis_wo_danish_2021/data/s08_pca/s03_non_related/common_hwe_norel.bim"
## $ famfile: chr "/Users/alexey/Documents/wecare/final_analysis_2021/reanalysis_wo_danish_2021/data/s08_pca/s03_non_related/common_hwe_norel.fam"
## $ fam :'data.frame': 335 obs. of 6 variables:
## ..$ family.ID : chr [1:335] "P1_A01" "P1_A02" "P1_A03" "P1_A04" ...
## ..$ sample.ID : chr [1:335] "P1_A01" "P1_A02" "P1_A03" "P1_A04" ...
## ..$ paternal.ID: int [1:335] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ maternal.ID: int [1:335] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ sex : int [1:335] 2 2 2 2 2 2 2 2 2 2 ...
## ..$ affection : int [1:335] 2 2 2 2 2 2 1 2 2 1 ...
## $ map :'data.frame': 39628 obs. of 6 variables:
## ..$ chromosome : int [1:39628] 1 1 1 1 1 1 1 1 1 1 ...
## ..$ marker.ID : chr [1:39628] "1_881627_G_A" "1_897325_G_C" "1_897564_T_C" "1_897738_C_T" ...
## ..$ genetic.dist: int [1:39628] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ physical.pos: int [1:39628] 881627 897325 897564 897738 911916 982941 982994 986443 987200 990380 ...
## ..$ allele1 : chr [1:39628] "G" "G" "T" "T" ...
## ..$ allele2 : chr [1:39628] "A" "C" "C" "C" ...
## $ nrow : int 335
## $ ncol : int 39628
## $ light :Reference class 'bed_light' [package "bigsnpr"] with 5 fields
## ..$ bedfile: chr "/Users/alexey/Documents/wecare/final_analysis_2021/reanalysis_wo_danish_2021/data/s08_pca/s03_non_related/common_hwe_norel.bed"
## ..$ nrow : int 335
## ..$ ncol : int 39628
## ..$ extptr :<externalptr>
## ..$ address:<externalptr>
## ..and 15 methods, of which 1 is possibly relevant:
## .. initialize
## $ address:<externalptr>
## and 16 methods, of which 2 are possibly relevant:
## initialize, show#envRefClass
wecare.bed$bedfile
## [1] "/Users/alexey/Documents/wecare/final_analysis_2021/reanalysis_wo_danish_2021/data/s08_pca/s03_non_related/common_hwe_norel.bed"
wecare.bed$address
## <pointer: 0x7f8ec148a6c0>
# Clean-up
rm(bed_file)
wecare_fam.df <- wecare.bed$fam
dim(wecare_fam.df)
## [1] 335 6
head(wecare_fam.df)
## family.ID sample.ID paternal.ID maternal.ID sex affection
## 1 P1_A01 P1_A01 0 0 2 2
## 2 P1_A02 P1_A02 0 0 2 2
## 3 P1_A03 P1_A03 0 0 2 2
## 4 P1_A04 P1_A04 0 0 2 2
## 5 P1_A05 P1_A05 0 0 2 2
## 6 P1_A06 P1_A06 0 0 2 2
# Phenotypes from external file
load(file.path(project_folder,"data","s06_filter","s04_filter_by_sample_call_rates.RData"))
rm(genotypes.mx,variants.df)
project_folder <- file.path(base_folder,"wecare","final_analysis_2021","reanalysis_wo_danish_2021") # mac
scripts_folder <- file.path(project_folder,"scripts","s08_pca")
data_folder <- file.path(project_folder,"data","s08_pca")
dim(phenotypes.df)
## [1] 335 40
colnames(phenotypes.df)
## [1] "wes_id" "gwas_id" "merged_id" "filter"
## [5] "cc" "setno" "registry" "family_history"
## [9] "age_dx" "age_ref" "rstime" "eig1_gwas"
## [13] "eig2_gwas" "eig3_gwas" "eig4_gwas" "eig5_gwas"
## [17] "stage" "er" "pr" "hist_cat"
## [21] "hormone" "chemo_cat" "br_xray" "br_xray_dose"
## [25] "age_menarche" "age_1st_ftp" "age_menopause" "num_preg"
## [29] "bmi_age18" "bmi_dx" "bmi_ref" "eig1_wecare"
## [33] "eig2_wecare" "eig3_wecare" "eig4_wecare" "eig5_wecare"
## [37] "Batch_ID" "Subject_ID" "Barcode" "danish"
# Merge fam-file and phenotypes from external file
wecare_phenotypes.df <- left_join(wecare_fam.df, phenotypes.df,
by=c("sample.ID"="wes_id"))
dim(wecare_phenotypes.df)
## [1] 335 45
colnames(wecare_phenotypes.df)
## [1] "family.ID" "sample.ID" "paternal.ID" "maternal.ID"
## [5] "sex" "affection" "gwas_id" "merged_id"
## [9] "filter" "cc" "setno" "registry"
## [13] "family_history" "age_dx" "age_ref" "rstime"
## [17] "eig1_gwas" "eig2_gwas" "eig3_gwas" "eig4_gwas"
## [21] "eig5_gwas" "stage" "er" "pr"
## [25] "hist_cat" "hormone" "chemo_cat" "br_xray"
## [29] "br_xray_dose" "age_menarche" "age_1st_ftp" "age_menopause"
## [33] "num_preg" "bmi_age18" "bmi_dx" "bmi_ref"
## [37] "eig1_wecare" "eig2_wecare" "eig3_wecare" "eig4_wecare"
## [41] "eig5_wecare" "Batch_ID" "Subject_ID" "Barcode"
## [45] "danish"
# Make sure that dplyr::left_joint hasnt changed the order of samples
sum(substr(wecare_phenotypes.df$merged_id,1,6) != wecare_fam.df$sample.ID)
## [1] 0
# Add column fopr outliers
wecare_phenotypes.df <- data.frame(wecare_phenotypes.df,outlier=F)
# Clean-up
rm(phenotypes.df, wecare_fam.df)
# map file
wecare_map.df <- wecare.bed$map
dim(wecare_map.df)
## [1] 39628 6
head(wecare_map.df)
## chromosome marker.ID genetic.dist physical.pos allele1 allele2
## 1 1 1_881627_G_A 0 881627 G A
## 2 1 1_897325_G_C 0 897325 G C
## 3 1 1_897564_T_C 0 897564 T C
## 4 1 1_897738_C_T 0 897738 T C
## 5 1 1_911916_C_T 0 911916 T C
## 6 1 1_982941_T_C 0 982941 T C
# make simple counts
wecare_maf.df <- bed_MAF(wecare.bed)
dim(wecare_maf.df)
## [1] 39628 5
head(wecare_maf.df)
## ac mac af maf N
## 1 196 196 0.33793103 0.33793103 290
## 2 42 42 0.07266436 0.07266436 289
## 3 43 43 0.06847134 0.06847134 314
## 4 41 41 0.07093426 0.07093426 289
## 5 85 85 0.12724551 0.12724551 334
## 6 61 61 0.09327217 0.09327217 327
# merge map file with the counts
wecare_variants.df <- cbind(wecare_map.df,wecare_maf.df)
dim(wecare_variants.df)
## [1] 39628 11
head(wecare_variants.df)
## chromosome marker.ID genetic.dist physical.pos allele1 allele2 ac mac
## 1 1 1_881627_G_A 0 881627 G A 196 196
## 2 1 1_897325_G_C 0 897325 G C 42 42
## 3 1 1_897564_T_C 0 897564 T C 43 43
## 4 1 1_897738_C_T 0 897738 T C 41 41
## 5 1 1_911916_C_T 0 911916 T C 85 85
## 6 1 1_982941_T_C 0 982941 T C 61 61
## af maf N
## 1 0.33793103 0.33793103 290
## 2 0.07266436 0.07266436 289
## 3 0.06847134 0.06847134 314
## 4 0.07093426 0.07093426 289
## 5 0.12724551 0.12724551 334
## 6 0.09327217 0.09327217 327
# Variants with AF(ref) < AF(alt)
sum(wecare_variants.df$ac != wecare_variants.df$mac)
## [1] 0
# Clean-up
rm(wecare_map.df, wecare_maf.df)
Takes care about LD etc.
See ?plot.big_SVD for plotting svd objets.
# Get indices of non-outliers in format required by bed_autoSVD
# (integer indices, indicating row numbers)
non_outliers1 <- which(!wecare_phenotypes.df$outlier)
length(non_outliers1)
## [1] 335
# bigsnpr::bed_autoSVD, Default k = 10
#using all samples (ind.row) and variants (ind.col)
#table(wecare.bed$map$chromosome) - if complains abotut non-numeric chromosomes
wecare.svd1 <- bed_autoSVD(wecare.bed,
ind.row=non_outliers1,
ncores = NCORES)
##
## Phase of clumping (on MAC) at r^2 > 0.2.. keep 17837 variants.
## Discarding 0 variant with MAC < 10.
##
## Iteration 1:
## Computing SVD..
## 128 outlier variants detected..
## 1 long-range LD region detected..
##
## Iteration 2:
## Computing SVD..
## 0 outlier variant detected..
##
## Converged!
#ind.col=vars_not_in_LD,
# Variants not in LD (detected by clumping during autoSVD)
vars_not_in_LD1 <- attr(wecare.svd1, "subset")
length(vars_not_in_LD1)
## [1] 17709
#attributes(wecare.svd)
str(wecare.svd1)
## List of 7
## $ d : num [1:10] 196 183 152 152 151 ...
## $ u : num [1:335, 1:10] 0.1363 0.1451 -0.0121 -0.0211 -0.0259 ...
## $ v : num [1:17709, 1:10] 0.000921 -0.005054 0.004853 0.00721 -0.009297 ...
## $ niter : num 15
## $ nops : num 244
## $ center: num [1:17709] 0.676 0.137 0.254 0.211 0.511 ...
## $ scale : num [1:17709] 0.669 0.357 0.471 0.435 0.617 ...
## - attr(*, "class")= chr "big_SVD"
## - attr(*, "subset")= int [1:17709] 1 3 5 9 10 11 12 15 16 20 ...
## - attr(*, "lrldr")='data.frame': 1 obs. of 3 variables:
## ..$ Chr : num 6
## ..$ Start: num 32796751
## ..$ Stop : num 42650772
# Eigenvalues
length(wecare.svd1$d)
## [1] 10
wecare.svd1$d
## [1] 195.8245 182.9719 151.7824 151.5087 151.1509 150.7539 150.6255 150.5100
## [9] 150.1112 149.5871
plot(wecare.svd1) # default type="screeplot" see ?plot.big_SVD
## Warning: Continuous limits supplied to discrete scale.
## Did you mean `limits = factor(...)` or `scale_*_continuous()`?
# Eigenvectors
dim(wecare.svd1$u)
## [1] 335 10
head(wecare.svd1$u)
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0.13630917 0.105728563 -0.058869640 0.053270057 -0.071375682
## [2,] 0.14509744 0.060019838 0.009664619 -0.036790376 0.004449234
## [3,] -0.01212106 -0.003851353 -0.028351214 0.034059416 0.084183131
## [4,] -0.02109911 0.018570166 0.071605066 -0.005023272 0.002584699
## [5,] -0.02591967 -0.028676699 0.072597633 0.010689744 0.060390468
## [6,] -0.01649853 -0.012722733 0.023779948 0.053496518 -0.081919467
## [,6] [,7] [,8] [,9] [,10]
## [1,] -0.083141571 0.09325142 -0.054764636 0.029491944 0.028048716
## [2,] 0.004385738 -0.02111137 0.008643476 -0.004200611 0.063309640
## [3,] -0.068367578 0.01421584 0.008810475 0.001191556 -0.031455299
## [4,] 0.030187082 -0.05529577 0.003396932 0.070203572 0.032446894
## [5,] -0.022477192 0.07267845 0.064590228 -0.001438803 0.003509294
## [6,] 0.008744997 0.03610154 -0.024077602 0.141481805 -0.053570862
# PCA summary (for PCs from 1 to 10)
plot(wecare.svd1,type = "scores",scores=1:10,coeff=0.4)
# Loadings
dim(wecare.svd1$v)
## [1] 17709 10
head(wecare.svd1$v)
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0.000920977 -0.0008277551 0.0005237386 -0.008730989 0.009041437
## [2,] -0.005053747 0.0044461637 0.0027742776 -0.010116289 -0.002725499
## [3,] 0.004853187 0.0028474205 -0.0051168462 -0.012379625 0.013007407
## [4,] 0.007210244 0.0021103606 -0.0011785656 0.006364635 0.009475204
## [5,] -0.009296606 0.0052304251 0.0013548087 0.001468288 -0.007015240
## [6,] 0.009219696 -0.0192255697 -0.0082900970 0.014792149 0.002904332
## [,6] [,7] [,8] [,9] [,10]
## [1,] 0.005177317 0.006716877 -0.014870474 -0.0077179427 0.0077299110
## [2,] -0.005139297 -0.006675946 -0.002947939 0.0030950364 0.0003375347
## [3,] 0.010588261 0.004296047 -0.014231075 -0.0146200084 -0.0028576898
## [4,] 0.006088027 0.006850451 0.001843243 0.0025879821 -0.0043882490
## [5,] -0.002124761 -0.012870763 -0.002623696 -0.0000418261 -0.0001034652
## [6,] -0.008097585 -0.012913949 0.017080498 0.0004770108 -0.0125200209
# Loadings summary (for PCs from 1 to 10)
plot(wecare.svd1,type="loadings",loadings=1:10,coeff=0.4)
# Calculate a measure of "outlieness"
U1 <- wecare.svd1$u
prob1 <- prob_dist(U1, ncores=NCORES) # bigutilsr::prob_dist
S1 <- prob1$dist.self / sqrt(prob1$dist.nn)
tukey_threshold1 <- tukey_mc_up(S1) # bigutilsr::tukey_mc_up
# Outliers
outliers1 <- S1 >= tukey_threshold1
sum(outliers1)
## [1] 1
outliers_id1 <- wecare.bed$fam$sample.ID[S1 >= tukey_threshold1]
outliers_id1
## [1] "P6_D05"
# Histogram by "outlieness" score
ggplot() +
geom_histogram(aes(S1), color = "black", fill = "blue", alpha = 0.3) +
theme_bigstatsr() +
geom_vline(xintercept=tukey_threshold1, colour="red") +
labs(x = "Statistic of outlierness (S)", y = "Frequency (sqrt-scale)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Location of outlier(s) in PCA plots
plot(U1[, 1:2], col = (S1 > tukey_threshold1) + 1, pch = 20)
plot(U1[, 3:4], col = (S1 > tukey_threshold1) + 1, pch = 20)
plot(U1[, 5:6], col = (S1 > tukey_threshold1) + 1, pch = 20)
# Add outlier to the phenotypes data frame
wecare_phenotypes.df$outlier <- wecare_phenotypes.df$outlier |
wecare_phenotypes.df$sample.ID %in% outliers_id1
sum(wecare_phenotypes.df$outlier)
## [1] 1
# Clean-up
rm(non_outliers1,U1,prob1,S1,tukey_threshold1,outliers_id1,
outliers1,wecare.svd1) #vars_not_in_LD1
Re-run w/o previously detected outlier(s)
Using previously calculated vars_not_in_LD to speed-up
https://privefl.github.io/bigsnpr/articles/bedpca.html
# Get indices of non-outliers in format required by bed_autoSVD
# (integer indices, indicating row numbers)
non_outliers2 <- which(!wecare_phenotypes.df$outlier)
length(non_outliers2)
## [1] 334
# Calculate PCA
wecare.svd2 <- bed_autoSVD(wecare.bed,
ind.row=non_outliers2,
ind.col=vars_not_in_LD1,
ncores = NCORES)
##
## Phase of clumping (on MAC) at r^2 > 0.2.. keep 17677 variants.
## Discarding 0 variant with MAC < 10.
##
## Iteration 1:
## Computing SVD..
## 0 outlier variant detected..
##
## Converged!
# ind.col=vars_not_in_LD1 - removes the outlier
# Variants not in LD (detected by clumping during autoSVD)
#vars_not_in_LD2 <- attr(wecare.svd2, "subset")
#ength(vars_not_in_LD2)
# Explore PCA results
plot(wecare.svd2)
## Warning: Continuous limits supplied to discrete scale.
## Did you mean `limits = factor(...)` or `scale_*_continuous()`?
plot(wecare.svd2, type = "loadings", loadings=1:10, coeff=0.4)
plot(wecare.svd2,type = "scores",scores=1:10,coeff=0.4)
# Calculate a measure of "outlieness"
U2 <- wecare.svd2$u
prob2 <- prob_dist(U2, ncores=NCORES) # bigutilsr::prob_dist
S2 <- prob2$dist.self / sqrt(prob2$dist.nn)
tukey_threshold2 <- tukey_mc_up(S2) # bigutilsr::tukey_mc_up
# Outliers
outliers2 <- S2 >= tukey_threshold2
sum(outliers2)
## [1] 1
outliers_id2 <- wecare.bed$fam$sample.ID[S2 >= tukey_threshold2]
outliers_id2
## [1] "P5_E09"
# Histogram by "outlieness" score
ggplot() +
geom_histogram(aes(S2), color = "black", fill = "blue", alpha = 0.3) +
theme_bigstatsr() +
geom_vline(xintercept=tukey_threshold2, colour="red") +
labs(x = "Statistic of outlierness (S)", y = "Frequency (sqrt-scale)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Location of outlier(s) in PCA plots
plot(U2[, 1:2], col = (S2 > tukey_threshold2) + 1, pch = 20)
plot(U2[, 3:4], col = (S2 > tukey_threshold2) + 1, pch = 20)
plot(U2[, 5:6], col = (S2 > tukey_threshold2) + 1, pch = 20)
# Add outlier to the phenotypes data frame
wecare_phenotypes.df$outlier <- wecare_phenotypes.df$outlier |
wecare_phenotypes.df$sample.ID %in% outliers_id2
sum(wecare_phenotypes.df$outlier)
## [1] 2
# Clean-up
rm(non_outliers2,U2,prob2,S2,tukey_threshold2,outliers_id2,
outliers2,wecare.svd2) # vars_not_in_LD2
Re-run w/o previously detected outlier(s)
Using previously calculated vars_not_in_LD to speed-up
https://privefl.github.io/bigsnpr/articles/bedpca.html
# Get indices of non-outliers in format required by bed_autoSVD
# (integer indices, indicating row numbers)
non_outliers3 <- which(!wecare_phenotypes.df$outlier)
length(non_outliers3)
## [1] 333
# Calculate PCA
wecare.svd3 <- bed_autoSVD(wecare.bed,
ind.row=non_outliers3,
ind.col=vars_not_in_LD1,
ncores = NCORES)
##
## Phase of clumping (on MAC) at r^2 > 0.2.. keep 17653 variants.
## Discarding 0 variant with MAC < 10.
##
## Iteration 1:
## Computing SVD..
## 0 outlier variant detected..
##
## Converged!
# Explore PCA result
plot(wecare.svd3)
## Warning: Continuous limits supplied to discrete scale.
## Did you mean `limits = factor(...)` or `scale_*_continuous()`?
plot(wecare.svd3, type = "loadings", loadings=1:10, coeff=0.4)
plot(wecare.svd3,type = "scores",scores=1:10,coeff=0.4)
# Calculate a measure of "outlieness"
U3 <- wecare.svd3$u
prob3 <- prob_dist(U3, ncores=NCORES) # bigutilsr::prob_dist
S3 <- prob3$dist.self / sqrt(prob3$dist.nn)
tukey_threshold3 <- tukey_mc_up(S3) # bigutilsr::tukey_mc_up
# Outliers
outliers3 <- S3 >= tukey_threshold3
sum(outliers3)
## [1] 0
# Histogram by "outlieness" score
ggplot() +
geom_histogram(aes(S3), color = "black", fill = "blue", alpha = 0.3) +
theme_bigstatsr() +
geom_vline(xintercept=tukey_threshold3, colour="red") +
labs(x = "Statistic of outlierness (S)", y = "Frequency (sqrt-scale)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Clean-up
# Clean-up
rm(non_outliers3,U3,prob3,S3,tukey_threshold3,outliers3) # vars_not_in_LD2
updated_phenotypes.df <- wecare_phenotypes.df[!wecare_phenotypes.df$outlier,]
dim(updated_phenotypes.df)
## [1] 333 46
eigenvectors.mx <- wecare.svd3$u
dim(eigenvectors.mx)
## [1] 333 10
colnames(eigenvectors.mx) <-
c("pc1","pc2","pc3","pc4","pc5","pc6","pc7","pc8","pc9","pc10")
updated_phenotypes.df <- cbind(updated_phenotypes.df, eigenvectors.mx)
dim(updated_phenotypes.df)
## [1] 333 56
colnames(updated_phenotypes.df)
## [1] "family.ID" "sample.ID" "paternal.ID" "maternal.ID"
## [5] "sex" "affection" "gwas_id" "merged_id"
## [9] "filter" "cc" "setno" "registry"
## [13] "family_history" "age_dx" "age_ref" "rstime"
## [17] "eig1_gwas" "eig2_gwas" "eig3_gwas" "eig4_gwas"
## [21] "eig5_gwas" "stage" "er" "pr"
## [25] "hist_cat" "hormone" "chemo_cat" "br_xray"
## [29] "br_xray_dose" "age_menarche" "age_1st_ftp" "age_menopause"
## [33] "num_preg" "bmi_age18" "bmi_dx" "bmi_ref"
## [37] "eig1_wecare" "eig2_wecare" "eig3_wecare" "eig4_wecare"
## [41] "eig5_wecare" "Batch_ID" "Subject_ID" "Barcode"
## [45] "danish" "outlier" "pc1" "pc2"
## [49] "pc3" "pc4" "pc5" "pc6"
## [53] "pc7" "pc8" "pc9" "pc10"
# Check consistency with the previous eigenvectors from WES
plot(updated_phenotypes.df$eig1_wecare,
updated_phenotypes.df$pc1,main="PC1: new vs old WES")
plot(updated_phenotypes.df$eig2_wecare,
updated_phenotypes.df$pc2,main="PC2: new vs old WES")
# Check consistency with the previous eigenvectors from GWAS
plot(updated_phenotypes.df$eig1_gwas,
updated_phenotypes.df$pc1,main="PC1: new WES vs GWAs")
plot(updated_phenotypes.df$eig2_gwas,
updated_phenotypes.df$pc2,main="PC2: new WES vs GWAs")
# Clean-up
rm(wecare_phenotypes.df, eigenvectors.mx, vars_not_in_LD1)
#somehow close wecare.bed ?
plot(wecare.svd3, type = "scores") +
aes(color = updated_phenotypes.df$affection == 2) +
labs(title = NULL, color = "Case")
plot(wecare.svd3, type = "scores", scores=3:4) +
aes(color = updated_phenotypes.df$affection == 2) +
labs(title = NULL, color = "Case")
plot(wecare.svd3, type = "scores", scores=5:6) +
aes(color = updated_phenotypes.df$affection == 2) +
labs(title = NULL, color = "Case")
plot(wecare.svd3, type = "scores", scores=7:8) +
aes(color = updated_phenotypes.df$affection == 2) +
labs(title = NULL, color = "Case")
plot(wecare.svd3, type = "scores", scores=9:10) +
aes(color = updated_phenotypes.df$affection == 2) +
labs(title = NULL, color = "Case")
save.image(file.path(data_folder,"s04_calculate_PCs.RData"))
save(updated_phenotypes.df,file=file.path(data_folder,"s04_phenotypes_with_PCs.RData"))
ls()
## [1] "base_folder" "data_folder" "NCORES"
## [4] "project_folder" "scripts_folder" "updated_phenotypes.df"
## [7] "wecare_variants.df" "wecare.bed" "wecare.svd3"
Sys.time()
## [1] "2021-01-22 23:50:53 GMT"
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 2157082 115.3 5738170 306.5 NA 7172712 383.1
## Vcells 6509215 49.7 52484708 400.5 16384 128129254 977.6